Loading required libraries
library(gapminder)
library(pacman)
pacman::p_load(data.table, fixest, stargazer, dplyr, magrittr)
library("ggplot2")
library(gganimate)
Gapminder data set:
# Preview the first six rows of the gapminder tibble.
head(gapminder, n = 6L)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
# Pull the two columns used in the regression out of the gapminder data:
# life expectancy as Y and population as X.
Y <- gapminder[["lifeExp"]]
X <- gapminder[["pop"]]
# Number of observations (rows) in the data set.
nrow(gapminder)
## [1] 1704
#Description:
1704 observations; fills a size niche between iris (150 rows) and the likes of diamonds (54K rows). 6 variables: country, a factor with 142 levels; continent, a factor with 5 levels; year, going from 1952 to 2007 in increments of 5 years; pop, population; gdpPercap, GDP per capita; lifeExp, life expectancy.
Transition through distinct states in time
# Static bubble chart: population (log10 x-axis) against life expectancy,
# with point size mapped to GDP per capita and colour mapped to country.
p <- ggplot(
  data = gapminder,
  mapping = aes(
    x = pop,
    y = lifeExp,
    size = gdpPercap,
    colour = country
  )
) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(x = "POP", y = "Life expectancy")

# Render the static version first.
p

# Animate the chart over the year column; the title shows the frame's time.
p +
  transition_time(year) +
  labs(title = "Year: {frame_time}")
Create facets by continent
# Same animation over year, split into one panel per continent.
p +
  facet_wrap(facets = ~continent) +
  transition_time(year) +
  labs(title = "Year: {frame_time}")
Linear regression using lm function:
# Fit a simple linear regression of Y on X by ordinary least squares.
lrm.fit <- lm(Y ~ X)
# Show the estimated intercept and slope.
coef(lrm.fit)
## (Intercept) X
## 5.924048e+01 7.903656e-09
# Monte Carlo simulation
In statistics, ordinary least squares (OLS) is a type of linear least squares method for estimating the unknown parameters in a linear regression model. OLS chooses the parameters of a linear function of a set of explanatory variables by the principle of least squares: minimizing the sum of the squares of the differences between the observed dependent variable (values of the variable being observed) in the given dataset and those predicted by the linear function of the independent variable.
# Parameters of the data-generating process ----
Beta0 <- 6        # true value of the intercept
Beta1 <- 0.00005  # true value of the slope
# Sample size. Derived from X so the simulated noise vector always has the
# same length as the regressor (1704 for the gapminder data).
n <- length(X)

# Simulation set-up: 5000 replications ----
N <- 5000       # number of replications
set.seed(1234)  # fix the RNG seed so the results are reproducible

# Preallocated storage for the estimates from each replication.
int.est <- numeric(N)       # intercepts estimated with lm()
slp.est <- numeric(N)       # slopes estimated with lm()
slope_DT <- numeric(N)      # slopes estimated with fixest::feols()
intercept_DT <- numeric(N)  # intercepts estimated with fixest::feols()

# NOTE: the loop reassigns Y and lrm.fit on every pass, so once it finishes
# they hold the simulated response and the fit from the last replication.
for (i in seq_len(N)) {
  # Simulate the response from the true line plus N(0, sd = 5) noise.
  Y <- Beta0 + Beta1 * X + rnorm(n, mean = 0, sd = 5)

  # Estimate with lm() and store the coefficients, selected by name.
  lrm.fit <- lm(Y ~ X)
  int.est[i] <- coef(lrm.fit)[["(Intercept)"]]
  slp.est[i] <- coef(lrm.fit)[["X"]]

  # Estimate the same model with fixest::feols() on a data.table and store
  # its coefficients as well.
  data_i <- data.table(Y = Y, X = X)
  ols_i <- fixest::feols(Y ~ X, data = data_i)
  slope_DT[i] <- ols_i$coefficients[["X"]]
  intercept_DT[i] <- ols_i$coefficients[["(Intercept)"]]
}
Summary statistics using OLS
# Collect the feols() estimates from every replication into one table.
estimates_DT <- data.table(
  beta_1 = slope_DT,
  beta_0 = intercept_DT
)
# Print text-format summary statistics for both coefficient columns.
# NOTE(review): the printed values are rounded, so the very small beta_1
# figures lose precision in the table -- consider raising the digits shown.
stargazer(
  estimates_DT[, .(beta_1, beta_0)],
  type = "text"
)
##
## ================================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## ----------------------------------------------------------------
## beta_1 5,000 0.0001 0.000 0.00005 0.00005 0.0001 0.0001
## beta_0 5,000 5.998 0.127 5.500 5.912 6.082 6.454
## ----------------------------------------------------------------